# bidi_algorithm - A pure-python implementation of the unicode BIDI algorithm
# Copyright (C) 2010 Noam Yorav-Raphael <noamraph@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""
This class defines the function tag_text, which tags a text for
directionality, by using the relevant part of the Unicode BIDI algorithm
(see http://unicode.org/reports/tr9/).

It defines the function tag_text, which gets an iterator over characters, and
return an iterator over characters and the constants OPEN_L, CLOSE_L, OPEN_R
and CLOSE_R. These constants are inserted between the given characters, to
give the directionality structure of the text.

This module currently doesn't implement the algorithm for arabic characters,
since those require some special treatment.
"""

__all__ = ['tag_text', 'OPEN_L', 'CLOSE_L', 'OPEN_R', 'CLOSE_R']

from collections import deque
from cdata import cdata

class StateMachine(object):
    '''
    This class is the base class for the state machine classes defined here.

    The state machine implemented has a state, which is a string. In each
    step, the machine gets two arguments, and according to these and the
    state, it produces some output and changes the state.

    In this implementation, the input is taken from one iterable, and the
    output is given using the iterable protocol. When output is requested,
    by a call to the next() method, the given iterable's next() method is
    called to produce the two input arguments (arg1, arg2). Then, the method
    named "%s_%s" % (current_state, arg1) is called with arg2 as its argument.
    The method is expected to return an iterable over output elements, which
    will be given upon subsequent calls to the instance's next() method.
    '''

    def __init__(self, iterable):
        if not hasattr(self, 'state'):
            raise NotImplementedError, \
                  'You must implement an __init__ method that will initialize '\
                  'self.state.'
        
        self._input_source = iter(iterable)
        self._current_output_generator = iter([])
        self._finishing = False

    def next(self):
        while True:
            try:
                return self._current_output_generator.next()
            except StopIteration:
                if self._finishing:
                    raise
            # Make another call to the appropriate method.

            try:
                letter, input = self._input_source.next()
            except StopIteration:
                self._finishing = True
                self._current_output_generator = \
                    getattr(self, '%s_END' % self.state)()
                if self._current_output_generator is None:
                    self._current_output_generator = iter([])
                continue

            self._current_output_generator = \
                getattr(self, '%s_%s' % (self.state, letter))(input)
            if self._current_output_generator is None:
                self._current_output_generator = iter([])

    def __iter__(self):
        return self
    

# For us, there are six types of chars:
#
# L - strong left-to-right
# R - strong right-to-left
# N - neutral
# EN - european number
# S - number seperator
# T - number terminator

class BaseLTR(StateMachine):
    '''
    Tag text when the base is LTR.
    '''
    
    def __init__(self, iterable, openL, closeL, openR, closeR):
        self.state = 'L'
        
        self.queueN = deque()
        self.queueT = deque()
        self.S = None

        self.openL, self.closeL, self.openR, self.closeR = \
            openL, closeL, openR, closeR

        return StateMachine.__init__(self, iterable)
                       

    # L state - last strong character was L.

    def L_R(self, input):
        yield self.openR
        yield input
        self.state = 'R'

    def L_otherwise(self, input):
        yield input
    L_N = L_L = L_EN = L_T = L_S = L_otherwise

    def L_END(self):
        pass


    # R state - last strong char was R.
    # There are two queues in this state, one for N characters (which might
    # include ET chars not in the right), and one for ET chars.

    def R_R(self, input):
        while self.queueN:
            yield self.queueN.popleft()
        while self.queueT:
            yield self.queueT.popleft()

        yield input

    def R_L(self, input):
        yield self.closeR

        while self.queueN:
            yield self.queueN.popleft()
        while self.queueT:
            yield self.queueT.popleft()

        yield input

        self.state = 'L'

    def R_N(self, input):
        while self.queueT:
            self.queueN.append(self.queueT.popleft())

        self.queueN.append(input)

    R_S = R_N # A Number Seperator is like a neutral

    def R_T(self, input):
        self.queueT.append(input)

    def R_EN(self, input):
        while self.queueN:
            yield self.queueN.popleft()

        yield self.openL

        while self.queueT:
            yield self.queueT.popleft()

        yield input

        self.state = 'EN'

    def R_END(self):
        yield self.closeR
        
        while self.queueN:
            yield self.queueN.popleft()
        while self.queueT:
            yield self.queueT.popleft()


    # EN state - inside a number in an RTL context, the last char was not
    # a Number Seperator.

    def EN_EN(self, input):
        yield input

    def EN_T(self, input):
        # A Number Terminator in that place behaves exactly like a digit.
        yield input

    def EN_S(self, input):
        self.S = input
        self.state = 'S'

    def EN_R(self, input):
        yield self.closeL

        yield input

        self.state = 'R'

    def EN_L(self, input):
        yield self.closeL
        yield self.closeR

        yield input

        self.state = 'L'

    def EN_N(self, input):
        yield self.closeL

        self.queueN.append(input)
        self.state = 'R'

    def EN_END(self):
        yield self.closeL
        yield self.closeR


    # S state - after a number, a Number Seperator was given (it is in self.S).
    # Will it be a part of the number?

    def S_EN(self, input):
        yield self.S
        self.S = None

        yield input
        self.state = 'EN'

    def S_R(self, input):
        yield self.closeL
        yield self.S
        self.S = None
        yield input
        self.state = 'R'

    def S_L(self, input):
        yield self.closeL
        yield self.closeR

        yield self.S
        self.S = None

        yield input
        self.state = 'L'

    def S_N(self, input):
        yield self.closeL

        self.queueN.append(self.S)
        self.S = None
        
        self.queueN.append(input)
        self.state = 'R'

    S_S = S_N

    def S_T(self, input):
        yield self.closeL

        self.queueN.append(self.S)
        self.S = None

        self.queueT.append(input)
        self.state = 'R'
        
    def S_END(self):
        yield self.closeL
        yield self.closeR

        yield self.S
        self.S = None


class BaseRTL(StateMachine):
    '''
    Tag text when the base is RTL.
    '''
    
    def __init__(self, iterable, openL, closeL, openR, closeR):
        self.state = 'R'
        
        self.queue = deque()
        self.S = None

        self.openL, self.closeL, self.openR, self.closeR = \
            openL, closeL, openR, closeR

        return StateMachine.__init__(self, iterable)
                       

    # R state - last strong character was R. The queue may contain terminators
    # which may be part of a number.

    def R_continue(self, input):
        while self.queue:
            yield self.queue.popleft()
        
        yield input

    R_N = R_S = R_R = R_continue

    def R_T(self, input):
        self.queue.append(input)

    def R_L(self, input):
        while self.queue:
            yield self.queue.popleft()

        yield self.openL
        yield input
        self.state = 'L'

    def R_EN(self, input):
        yield self.openL

        while self.queue:
            yield self.queue.popleft()

        yield input
        self.state = 'ENR'

    def R_END(self):
        while self.queue:
            yield self.queue.popleft()


    # ENR state - last character was a number, and the strong character before
    # it was R.

    def ENR_EN(self, input):
        yield input

    ENR_T = ENR_EN

    def ENR_S(self, input):
        self.S = input
        self.state = 'SR'

    def ENR_R(self, input):
        yield self.closeL
        yield input
        self.state = 'R'

    ENR_N = ENR_R

    def ENR_L(self, input):
        # According to the Unicode algorithm, in an RTL part, numbers are
        # ordered together with LTR chars.
        yield input
        self.state = 'L'

    def ENR_END(self):
        yield self.closeL


    # SR state - last character was a seperator, which can be a part of a
    # number (it it saves in self.S), and we are in an RTL context.

    def SR_EN(self, input):
        yield self.S; self.S = None
        yield input
        self.state = 'ENR'

    def SR_back_to_R(self, input):
        yield self.closeL
        yield self.S; self.S = None
        yield input
        self.state = 'R'

    SR_S = SR_N = SR_R = SR_back_to_R

    def SR_T(self, input):
        yield self.closeL
        yield self.S; self.S = None
        self.queue.append(input)
        self.state = 'R'

    def SR_L(self, input):
        yield self.closeL
        yield self.S; self.S = None
        yield self.openL
        yield input
        self.state = 'L'

    def SR_END(self):
        yield self.closeL
        yield self.S; self.S = None


    # L state - last character was a strong right-to-left char.
    # Terminals for us are the same as simple Neutrals, so we fill only queue.

    def L_L(self, input):
        while self.queue:
            yield self.queue.popleft()

        yield input

    def L_neutral(self, input):
        self.queue.append(input)

    L_T = L_S = L_N = L_neutral

    def L_EN(self, input):
        while self.queue:
            yield self.queue.popleft()

        yield input
        
        self.state = 'ENL'

    def L_R(self, input):
        yield self.closeL

        while self.queue:
            yield self.queue.popleft()

        yield input

        self.state = 'R'
        
    def L_END(self):
        yield self.closeL

        while self.queue:
            yield self.queue.popleft()


    # ENL state - last character was a Number, in an LTR context.

    def ENL_L(self, input):
        yield input

        self.state = 'L'

    def ENL_N(self, input):
        self.queue.append(input)

        self.state = 'L'

    def ENL_expand_number(self, input):
        yield input

    ENL_EN = ENL_T = ENL_expand_number

    def ENL_S(self, input):
        self.S = input
        self.state = 'SL'

    def ENL_R(self, input):
        yield self.closeL
        yield input
        self.state = 'R'

    def ENL_END(self):
        yield self.closeL


    # SL state - last character was a seperator (saved in self.S), which came
    # after a number, and we are in LTR context.

    def SL_EN(self, input):
        yield self.S; self.S = None
        yield input

        self.state = 'ENL'

    def SL_L(self, input):
        yield self.S; self.S = None
        yield input

        self.state = 'L'

    def SL_neutral(self, input):
        self.queue.append(self.S); self.S = None
        self.queue.append(input)

        self.state = 'L'

    SL_N = SL_T = SL_S = SL_neutral

    def SL_R(self, input):
        yield self.closeL
        yield self.S; self.S = None
        yield input

        self.state = 'R'

    def SL_END(self):
        yield self.closeL
        yield self.S; self.S = None



class Constant(object):
    def __init__(self, name):
        self._name = name
    def __repr__(self):
        return self.name

OPEN_L = Constant('OPEN_L')
CLOSE_L = Constant('CLOSE_L')
OPEN_R = Constant('OPEN_R')
CLOSE_R = Constant('CLOSE_R')

def tag_text(text, is_ltr):
    def typer(s):
        for c in s:
            yield cdata[c], c
    if is_ltr:
        return BaseLTR(typer(text), OPEN_L, CLOSE_L, OPEN_R, CLOSE_R)
    else:
        return BaseRTL(typer(text), OPEN_L, CLOSE_L, OPEN_R, CLOSE_R)

